First, we import the required packages and load the cleaned data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import cufflinks as cf
from tqdm import tqdm
import chart_studio
import re
import urllib.request
import os
# NOTE(review): the chart_studio API key is hard-coded in the source, so anyone who can
# read this file can read the key. Consider loading it from an environment variable.
chart_studio.tools.set_credentials_file(username='zhyiyang', api_key='bwSqRbNvLD1oZ8xYIiQF')
from plotly.offline import iplot, init_notebook_mode
# Switch plotly to notebook mode before any chart is drawn.
init_notebook_mode()
# Load the cleaned data of stock 600000 (one CSV per stock code under cleaned_stock/).
data = pd.read_csv("cleaned_stock/600000.csv")
Target: the daily return of the stock, which is already contained in the column "change_rate".
In this part, we add technical analysis features to the data.
def add_features(data):
    """Add technical-analysis feature columns to a daily price DataFrame.

    The frame is modified in place and also returned, so both
    ``add_features(df)`` and ``df = add_features(df)`` work.

    Required input columns: ``close``, ``pre-close``, ``high``, ``low``,
    ``volume`` and ``change_rate``. Rows are assumed to be in chronological
    order (TODO confirm against the cleaning step).

    Columns added, in this order:

    * ``10d_r``   - 10-row return of ``close``.
    * ``5mom``, ``10mom``, ``25mom`` - momentum of the typical price
      ``(high + low + close) / 3`` over 5, 10 and 25 rows.
    * ``RSI_10``  - percentage of the 10 preceding rows whose ``close`` was
      above ``pre-close``.
    * ``VWAP``    - 5-row volume-weighted average of the typical price.
    * ``DUVOL``   - log ratio of down-side to up-side squared deviations from
      the 60-row mean of ``change_rate`` over a 25-row window; ``NaN`` and
      ``-inf`` are replaced by ``-100``.
    * ``MA_5``, ``MA_10``, ``MA_25`` - simple moving averages of ``close``.

    Args:
        data: DataFrame holding the required columns.

    Returns:
        The same DataFrame object with the feature columns appended.
    """
    close = data["close"]
    pre_close = data["pre-close"]
    volume = data["volume"]
    daily_ret = data["change_rate"]

    # 10 day's return
    data["10d_r"] = (close - close.shift(10)) / close.shift(10)

    # Typical price is only an intermediate, so it is kept out of the frame.
    typical_price = (data["high"] + data["low"] + close) / 3

    # 5, 10 and 25 day period momentum
    for window in (5, 10, 25):
        data[f"{window}mom"] = typical_price - typical_price.shift(window)

    # RSI10: share of up days among the 10 rows *before* the current one.
    # shift(1) reproduces the lag of the original loop, which wrote the count
    # of rows i..i+9 into row i+10.
    rose = (close > pre_close).astype(float)
    data["RSI_10"] = rose.rolling(10).sum().shift(1) / 10 * 100

    # 5 day VWAP
    data["VWAP"] = (typical_price * volume).rolling(5).sum() / volume.rolling(5).sum()

    # 25 day DUVOL
    mean_ret_60 = daily_ret.rolling(60).mean()
    up_or_flat = close >= pre_close
    # On the "other side" days the return is replaced by the mean so that the
    # squared deviation contributes zero to that side's sum.
    up_ret = daily_ret.where(up_or_flat, mean_ret_60)
    down_ret = daily_ret.where(~up_or_flat, mean_ret_60)
    up_sq_sum = ((up_ret - mean_ret_60) ** 2).rolling(25).sum()
    down_sq_sum = ((down_ret - mean_ret_60) ** 2).rolling(25).sum()
    # NOTE(review): as in the original code, the up-day count uses strict ">"
    # and the 25 rows preceding the current one, while the squared sums use
    # ">=" and a window ending on the current row - confirm this is intended.
    n_up = rose.rolling(25).sum().shift(1)
    n_down = 25 - n_up
    # Zero or negative ratios are expected in degenerate windows; they are
    # mapped to -100 below, so the NumPy warnings carry no information.
    with np.errstate(divide="ignore", invalid="ignore"):
        duvol = np.log(((n_up - 1) * down_sq_sum) / ((n_down - 1) * up_sq_sum))
    data["DUVOL"] = duvol.replace(-np.inf, -100).fillna(-100)

    # 5, 10, 25 Moving average
    for window in (5, 10, 25):
        data[f"MA_{window}"] = close.rolling(window).mean()

    return data
# add_features extends the frame in place and hands the same object back,
# so rebinding the name keeps `data` pointing at the enriched frame.
data = add_features(data)
# Preview the first rows, including the newly added feature columns.
data.head()
| time | code | close | high | low | open | pre-close | change_amount | change_rate | turnover_rate | ... | 10d_r | 5mom | 10mom | 25mom | RSI_10 | VWAP | DUVOL | MA_5 | MA_10 | MA_25 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-01-05 | 600000 | 16.07 | 16.25 | 15.56 | 15.88 | 15.69 | 0.38 | 2.4219 | 3.4415 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2015-01-06 | 600000 | 16.13 | 16.68 | 15.82 | 16.00 | 16.07 | 0.06 | 0.3734 | 3.4289 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2015-01-07 | 600000 | 15.81 | 16.17 | 15.53 | 15.90 | 16.13 | -0.32 | -1.9839 | 2.5848 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 2015-01-08 | 600000 | 15.25 | 15.88 | 15.20 | 15.87 | 15.81 | -0.56 | -3.5421 | 2.2156 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 2015-01-09 | 600000 | 15.43 | 16.25 | 15.11 | 15.20 | 15.25 | 0.18 | 1.1803 | 3.2970 | ... | NaN | NaN | NaN | NaN | NaN | 15.839461 | NaN | 15.738 | NaN | NaN |
5 rows × 24 columns
# Columns inspected below: raw market fields first, then the engineered features.
feature_list = [
    'change_rate',
    'turnover_rate',
    'volume',
    'amount',
    'market_value',
    '10d_r',
    '5mom',
    '10mom',
    '25mom',
    'RSI_10',
    'VWAP',
    'DUVOL',
    'MA_5',
    'MA_10',
    'MA_25',
]
import seaborn as sns
# One violin plot per feature, laid out on a 3 x 5 grid (15 features in total).
fig = plt.figure(figsize=(15, 10), dpi=80)
for position, feature in enumerate(feature_list, start=1):
    ax = fig.add_subplot(3, 5, position)
    sns.violinplot(ax=ax, data=data[feature])
    ax.set_title(feature)
fig.suptitle('Summary Plot for Features', size=25)
fig.tight_layout()
plt.show()
# Use the "time" column as the index before building the cufflinks QuantFig.
data = data.set_index("time")
qf = cf.QuantFig(data, title='600000.SH', legend='top', name='GS')
# Add the volume and MACD studies, then render the figure inline.
qf.add_volume(name='Volume', up_color='green', down_color='red')
qf.add_macd(name="MACD")
qf.iplot()
# Stock codes listed in the first workbook (sheet "行业").
index_component_start = pd.read_excel(
    "data/20150101-20150520.xlsx",
    sheet_name="行业",
)
stocks = index_component_start["代码"].tolist()

# Stock codes listed in the index-adjustment workbook.
index_component_change = pd.read_excel("data/index_adjustment.xlsx")
stocks_ch = index_component_change["代码"].tolist()

# De-duplicated union of both code lists.
index_component = list(set(stocks + stocks_ch))
print(index_component)
['600372.SH', '600547.SH', '601601.SH', '603993.SH', '601800.SH', '600703.SH', '600606.SH', '600276.SH', '600438.SH', '601658.SH', '600100.SH', '601995.SH', '601688.SH', '603501.SH', '600583.SH', '601118.SH', '601816.SH', '600832.SH', '600089.SH', '600030.SH', '601012.SH', '601633.SH', '601878.SH', '601336.SH', '600309.SH', '601727.SH', '600585.SH', '601328.SH', '600009.SH', '600015.SH', '601288.SH', '600332.SH', '600031.SH', '600637.SH', '600436.SH', '600570.SH', '600900.SH', '603986.SH', '601668.SH', '600196.SH', '601818.SH', '600000.SH', '600111.SH', '603160.SH', '601211.SH', '600019.SH', '600029.SH', '600104.SH', '600406.SH', '601888.SH', '601236.SH', '600485.SH', '600028.SH', '600036.SH', '603288.SH', '601166.SH', '600150.SH', '601788.SH', '600018.SH', '601088.SH', '601360.SH', '601318.SH', '600518.SH', '601939.SH', '601628.SH', '600919.SH', '601299.SH', '601319.SH', '601901.SH', '601881.SH', '601186.SH', '600837.SH', '600918.SH', '601169.SH', '601669.SH', '600588.SH', '601006.SH', '600048.SH', '600809.SH', '601198.SH', '603259.SH', '601728.SH', '600016.SH', '601766.SH', '600340.SH', '600893.SH', '601988.SH', '601390.SH', '600519.SH', '601919.SH', '601111.SH', '600795.SH', '601377.SH', '601229.SH', '600690.SH', '601985.SH', '600109.SH', '601899.SH', '600050.SH', '600256.SH', '601857.SH', '600010.SH', '601138.SH', '600958.SH', '601066.SH', '600745.SH', '601398.SH', '600999.SH', '601989.SH', '600887.SH', '601998.SH']
# Build the feature file for every index constituent.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("featured_stock", exist_ok=True)
for com in tqdm(index_component):
    # File names use the bare 6-digit code, e.g. "600000.SH" -> "600000".
    code = com[:6]
    data = pd.read_csv(f"cleaned_stock/{code}.csv")
    data_f = add_features(data)
    data_f.to_csv(f"featured_stock/{code}.csv", index=False)
1%| | 1/111 [00:03<07:00, 3.82s/it]C:\Users\15715\anaconda3\lib\site-packages\pandas\core\arraylike.py:358: RuntimeWarning: divide by zero encountered in log C:\Users\15715\anaconda3\lib\site-packages\pandas\core\arraylike.py:358: RuntimeWarning: invalid value encountered in log 100%|██████████| 111/111 [06:21<00:00, 3.44s/it]